library("readxl")
library("ggplot2")
library("data.table")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("stringr")
library("tidyr")
library("GGally")
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library("ggplot2")
library("MASS")
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library("corrplot")
## corrplot 0.84 loaded
library("plotly")
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load the Boston housing dataset (shipped with MASS) and inspect its structure
data("Boston")
glimpse(Boston)
## Observations: 506
## Variables: 14
## $ crim <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, ...
## $ zn <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5,...
## $ indus <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, ...
## $ chas <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ nox <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524...
## $ rm <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172...
## $ age <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0,...
## $ dis <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605...
## $ rad <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, ...
## $ tax <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311,...
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, ...
## $ black <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60...
## $ lstat <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.9...
## $ medv <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, ...
# Plot the median home value (medv).
# FIX: the original wrote Boston$med, which only resolved to medv via
# `$` partial matching — spell the column name out in full.
plot(Boston$medv)
# General summary of the dataset
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# Scatterplot matrix of every pair of variables
pairs(Boston)
# Correlation matrix, rounded to two decimals
cor_matrix <- round(cor(Boston), 2)
# Upper-triangle circle plot of the correlations (color bar at the
# bottom, variable labels on the diagonal)
corrplot(cor_matrix, method = "circle", type = "upper",
         cl.pos = "b", tl.pos = "d", tl.cex = 0.6)
# How do other variables stack up against crime rates? Do we see patterns?
# Reshape to long format so each variable can be faceted against crim.
# FIX: spell out `id.vars` (the original `id =` relied on partial
# argument matching) and convert explicitly to a data.table so melt()
# dispatches to data.table::melt without the data.frame warning.
molten <- melt(as.data.table(Boston), id.vars = "crim")
ggplot(molten, aes(x = value, y = crim)) +
  facet_wrap(~ variable, scales = "free") +
  geom_point()
# Center and standardize all variables (mean 0, sd 1); scale() returns
# a matrix with the centering/scaling values stored as attributes.
boston_scaled <- Boston %>% scale()
# Structure of the scaled matrix
glimpse(boston_scaled)
## num [1:506, 1:14] -0.419 -0.417 -0.417 -0.416 -0.412 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:506] "1" "2" "3" "4" ...
## ..$ : chr [1:14] "crim" "zn" "indus" "chas" ...
## - attr(*, "scaled:center")= Named num [1:14] 3.6135 11.3636 11.1368 0.0692 0.5547 ...
## ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
## - attr(*, "scaled:scale")= Named num [1:14] 8.602 23.322 6.86 0.254 0.116 ...
## ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
# Summaries of the scaled variables (means are all exactly zero)
summary(boston_scaled)
## crim zn indus
## Min. :-0.419367 Min. :-0.48724 Min. :-1.5563
## 1st Qu.:-0.410563 1st Qu.:-0.48724 1st Qu.:-0.8668
## Median :-0.390280 Median :-0.48724 Median :-0.2109
## Mean : 0.000000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.007389 3rd Qu.: 0.04872 3rd Qu.: 1.0150
## Max. : 9.924110 Max. : 3.80047 Max. : 2.4202
## chas nox rm age
## Min. :-0.2723 Min. :-1.4644 Min. :-3.8764 Min. :-2.3331
## 1st Qu.:-0.2723 1st Qu.:-0.9121 1st Qu.:-0.5681 1st Qu.:-0.8366
## Median :-0.2723 Median :-0.1441 Median :-0.1084 Median : 0.3171
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.2723 3rd Qu.: 0.5981 3rd Qu.: 0.4823 3rd Qu.: 0.9059
## Max. : 3.6648 Max. : 2.7296 Max. : 3.5515 Max. : 1.1164
## dis rad tax ptratio
## Min. :-1.2658 Min. :-0.9819 Min. :-1.3127 Min. :-2.7047
## 1st Qu.:-0.8049 1st Qu.:-0.6373 1st Qu.:-0.7668 1st Qu.:-0.4876
## Median :-0.2790 Median :-0.5225 Median :-0.4642 Median : 0.2746
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6617 3rd Qu.: 1.6596 3rd Qu.: 1.5294 3rd Qu.: 0.8058
## Max. : 3.9566 Max. : 1.6596 Max. : 1.7964 Max. : 1.6372
## black lstat medv
## Min. :-3.9033 Min. :-1.5296 Min. :-1.9063
## 1st Qu.: 0.2049 1st Qu.:-0.7986 1st Qu.:-0.5989
## Median : 0.3808 Median :-0.1811 Median :-0.1449
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4332 3rd Qu.: 0.6024 3rd Qu.: 0.2683
## Max. : 0.4406 Max. : 3.5453 Max. : 2.9865
# scale() produces a matrix, not a data frame
class(boston_scaled)
## [1] "matrix"
# Convert to a data frame so columns can be manipulated with dplyr
boston_scaled <- as.data.frame(boston_scaled)
# Summary of the (unscaled) crime rate
summary(Boston$crim)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00632 0.08204 0.25651 3.61352 3.67708 88.97620
# Quantile vector of 'crim' (min, quartiles, max) used as cut points
bins <- quantile(boston_scaled$crim)
bins
## 0% 25% 50% 75% 100%
## -0.419366929 -0.410563278 -0.390280295 0.007389247 9.924109610
# Categorical variable 'crime' from 'crim', using the quantiles as breaks.
# FIX: the argument is `labels` — the original `label =` only worked via
# partial argument matching.
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE,
             labels = c("low", "med_low", "med_high", "high"))
# Tabulation of the new factor crime (quartiles -> roughly equal counts)
table(crime)
## crime
## low med_low med_high high
## 127 126 126 127
# Removing original 'crim' from the dataset
boston_scaled <- dplyr::select(boston_scaled, -crim)
# Adding the new categorical variable to the scaled data
boston_scaled <- data.frame(boston_scaled, crime)
# Number of rows in the Boston dataset
n <- nrow(boston_scaled)
n
## [1] 506
# Choosing randomly 80% of the rows.
# FIX: floor() makes the integer truncation of n * 0.8 explicit —
# sample() would silently truncate a fractional size.
# NOTE(review): there is no set.seed() before this sample(), so the
# split (and everything downstream) is not reproducible — confirm
# whether a fixed seed should be set here.
ind <- sample(n, size = floor(n * 0.8))
ind
## [1] 208 415 338 218 428 29 482 37 419 266 43 459 295 245 163 405 177
## [18] 392 117 125 184 51 73 152 87 329 332 93 457 150 187 194 267 264
## [35] 491 100 416 71 447 417 280 273 265 13 19 148 195 238 309 365 237
## [52] 363 371 403 24 485 66 443 390 357 156 111 137 147 418 474 311 433
## [69] 58 20 109 188 364 448 196 383 478 440 30 136 384 381 263 190 115
## [86] 114 470 389 356 67 349 240 94 75 4 350 437 453 420 120 492 107
## [103] 460 425 489 359 319 90 203 170 336 3 293 191 439 377 85 22 451
## [120] 305 342 268 370 431 395 189 161 108 178 472 449 477 355 461 456 138
## [137] 153 387 223 272 248 41 483 337 116 36 171 301 444 259 32 432 8
## [154] 316 89 121 186 250 217 17 69 229 83 50 454 255 23 231 353 84
## [171] 450 462 16 224 241 143 234 230 318 52 249 312 330 33 55 274 48
## [188] 155 408 306 284 154 123 322 495 469 341 54 422 396 339 476 304 119
## [205] 465 261 298 92 227 345 139 287 160 286 185 290 307 25 289 7 39
## [222] 343 376 2 124 226 506 399 34 315 68 378 64 96 28 333 360 505
## [239] 481 35 220 409 26 104 313 285 308 216 113 63 181 176 144 406 233
## [256] 166 281 210 243 442 367 328 412 327 435 172 81 283 221 65 98 282
## [273] 201 493 379 214 159 101 193 502 21 149 169 213 334 222 74 400 441
## [290] 463 168 86 398 487 484 404 427 302 296 42 503 56 242 165 164 97
## [307] 72 501 388 239 366 112 141 391 11 423 14 225 414 95 46 488 62
## [324] 324 211 310 288 31 297 235 494 232 246 471 346 434 204 200 455 421
## [341] 207 180 76 105 275 452 340 202 413 401 358 15 445 102 91 325 260
## [358] 236 1 197 430 351 354 174 146 145 27 88 82 61 157 183 192 352
## [375] 347 271 361 446 140 127 486 292 158 303 130 331 110 79 205 362 348
## [392] 12 80 490 458 299 10 206 133 410 40 321 380 78
# Training set (the sampled 80%)
train <- boston_scaled[ind, ]
# Test set (the remaining 20%)
test <- boston_scaled[-ind, ]
# Saving correct classes from test data before removing them
correct_classes <- test$crime
# Removing 'crime' variable from test data
test <- dplyr::select(test, -crime)
# Linear discriminant analysis: predict the crime category from every
# other (scaled) variable in the training set.
lda.fit <- lda(crime ~ ., data = train)
# Show the fitted model: priors, group means, and LD coefficients
print(lda.fit)
## Call:
## lda(crime ~ ., data = train)
##
## Prior probabilities of groups:
## low med_low med_high high
## 0.2648515 0.2400990 0.2450495 0.2500000
##
## Group means:
## zn indus chas nox rm
## low 0.94633884 -0.9192809 -0.12514775 -0.8864142 0.41589660
## med_low -0.09979885 -0.2574144 -0.10997442 -0.5542212 -0.09941528
## med_high -0.40061971 0.1430102 0.28443258 0.4267988 0.08342828
## high -0.48724019 1.0171306 -0.03844192 1.0881908 -0.29013270
## age dis rad tax ptratio
## low -0.8878082 0.8916193 -0.6834844 -0.7243406 -0.43359574
## med_low -0.2662448 0.3198635 -0.5544520 -0.4372988 -0.06327037
## med_high 0.4561911 -0.4001977 -0.4215585 -0.3428480 -0.36344874
## high 0.7815714 -0.8247048 1.6379981 1.5139626 0.78062517
## black lstat medv
## low 0.38015350 -0.754983134 0.488969155
## med_low 0.34250737 -0.130474025 0.001028771
## med_high 0.03657157 -0.005407991 0.116145666
## high -0.75672950 0.851872426 -0.736470993
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3
## zn 0.099860618 0.59583106 -0.87608198
## indus -0.007291954 -0.20548620 0.36157252
## chas -0.102883183 -0.08015493 -0.04657410
## nox 0.394846881 -0.63302759 -1.30539961
## rm -0.126352380 0.01709332 -0.13604463
## age 0.234733803 -0.47076833 -0.07262002
## dis -0.105636200 -0.11168345 0.22647575
## rad 3.373790383 0.80138044 -0.30859979
## tax 0.042568750 0.14451721 0.75186343
## ptratio 0.149393459 0.03067491 -0.18287981
## black -0.120236528 0.07600376 0.21019962
## lstat 0.205348967 -0.08309739 0.62491027
## medv 0.204385439 -0.35862086 0.09495345
##
## Proportion of trace:
## LD1 LD2 LD3
## 0.9523 0.0363 0.0115
# Draw variable-loading arrows (with labels) on an existing LDA plot.
# x: a fitted lda object; myscale: arrow length multiplier;
# arrow_heads: arrowhead size; choices: which two LDs to use as axes.
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
  coefs <- coef(x)
  # Scaled endpoints of each arrow on the chosen discriminant axes
  x_end <- myscale * coefs[, choices[1]]
  y_end <- myscale * coefs[, choices[2]]
  arrows(x0 = 0, y0 = 0, x1 = x_end, y1 = y_end,
         col = color, length = arrow_heads)
  # Label each arrow tip with its variable name
  text(myscale * coefs[, choices], labels = row.names(coefs),
       cex = tex, col = color, pos = 3)
}
# Crime classes as integer codes for plot colors and symbols
crime_codes <- as.numeric(train$crime)
# Plot the LDA results in the first two discriminant dimensions
plot(lda.fit, dimen = 2, col = crime_codes, pch = crime_codes)
# Overlay the variable-loading arrows
lda.arrows(lda.fit, myscale = 2)
# Predict classes with the test data
lda.pred <- predict(lda.fit, newdata = test)
# Cross tabulate true vs predicted classes
tab <- table(correct = correct_classes, predicted = lda.pred$class)
tab
## predicted
## correct low med_low med_high high
## low 13 5 2 0
## med_low 10 14 5 0
## med_high 1 7 17 2
## high 0 0 0 26
# Row-wise proportions: the share of each true class assigned to each
# predicted class. FIX: the original rebuilt this by hand, rbinding
# tab[i, ]/sum(tab[i, ]) for every row — prop.table(tab, margin = 1)
# computes exactly the same values in one call.
Predict_accuracy <- prop.table(tab, margin = 1)
Predict_accuracy
## low med_low med_high high
## low 0.65000000 0.2500000 0.1000000 0.00000000
## med_low 0.34482759 0.4827586 0.1724138 0.00000000
## med_high 0.03703704 0.2592593 0.6296296 0.07407407
## high 0.00000000 0.0000000 0.0000000 1.00000000
# FIX: use library() rather than require() for loading — require()
# merely returns FALSE when the package is missing instead of erroring.
library(caret)
## Loading required package: caret
## Loading required package: lattice
# Full confusion matrix with overall and per-class statistics
confusionMatrix(correct_classes, lda.pred$class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction low med_low med_high high
## low 13 5 2 0
## med_low 10 14 5 0
## med_high 1 7 17 2
## high 0 0 0 26
##
## Overall Statistics
##
## Accuracy : 0.6863
## 95% CI : (0.5869, 0.7745)
## No Information Rate : 0.2745
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5812
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: low Class: med_low Class: med_high Class: high
## Sensitivity 0.5417 0.5385 0.7083 0.9286
## Specificity 0.9103 0.8026 0.8718 1.0000
## Pos Pred Value 0.6500 0.4828 0.6296 1.0000
## Neg Pred Value 0.8659 0.8356 0.9067 0.9737
## Prevalence 0.2353 0.2549 0.2353 0.2745
## Detection Rate 0.1275 0.1373 0.1667 0.2549
## Detection Prevalence 0.1961 0.2843 0.2647 0.2549
## Balanced Accuracy 0.7260 0.6705 0.7901 0.9643
# Drop the categorical 'crime' column so distances use numeric data only
boston_scaled <- dplyr::select(boston_scaled, -crime)
# Euclidean distance matrix (spelled out; this is the dist() default)
dist_eu <- dist(boston_scaled, method = "euclidean")
# Summary of the pairwise distances
summary(dist_eu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1343 3.3984 4.7296 4.7597 6.0150 12.5315
# Manhattan distance matrix
dist_man <- dist(boston_scaled, method = "manhattan")
# Summary of the pairwise distances
summary(dist_man)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2645 8.2073 12.0993 12.8752 16.8027 43.5452
# k-means clustering with 4 centers.
# NOTE(review): no set.seed() before these kmeans() calls, so cluster
# assignments are not reproducible across runs — confirm intent.
km4 <- kmeans(boston_scaled, centers = 4)
# Five variables chosen for visualizing the clusters
cluster_vars <- c("dis", "medv", "black", "lstat", "tax")
# Pairwise scatterplots colored by the 4-cluster assignment
pairs(boston_scaled[cluster_vars], col = km4$cluster)
# k-means clustering with 3 centers
km3 <- kmeans(boston_scaled, centers = 3)
# Same scatterplot matrix colored by the 3-cluster assignment
pairs(boston_scaled[cluster_vars], col = km3$cluster)
# Fix the RNG so the k-means runs below are reproducible
set.seed(100)
# Compute and plot total within-cluster sum of squares for k = 1..15
# to pick a cluster count by the elbow method.
k.max <- 15
# FIX: renamed the working copy from `data` (which masks utils::data())
# and replaced 1:k.max with seq_len(k.max).
km_data <- boston_scaled
clust_TSS <- sapply(seq_len(k.max),
                    function(k) {
                      kmeans(km_data, k, nstart = 50, iter.max = 15)$tot.withinss
                    })
clust_TSS
## [1] 6565.000 4207.600 3519.743 3098.744 2746.623 2399.515 2143.440
## [8] 1955.816 1832.813 1736.480 1637.171 1561.280 1498.560 1464.093
## [15] 1385.043
plot(seq_len(k.max), clust_TSS,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares")
# k-means clustering with 4 centers for the bonus exercise
km_bonus <- kmeans(boston_scaled, centers = 4)
# Linear discriminant analysis with the k-means clusters as target.
# BUG FIX: the original filtered km_bonus$cluster with %in% on row
# names, which keeps the clusters in the ORIGINAL row order while the
# rows of `train` are in shuffled (sampled) order — so each target was
# paired with the wrong row. Indexing the named cluster vector by
# train's row names keeps targets aligned with predictors.
cluster_target <- km_bonus$cluster[rownames(train)]
lda.fit.bonus <- lda(cluster_target ~ ., data = train)
# Target classes as numeric codes for plotting
classes <- as.numeric(cluster_target)
# Plot the LDA results
plot(lda.fit.bonus, dimen = 2, col = classes, pch = classes)
# BUG FIX: draw arrows from the bonus model, not the earlier
# crime-rate fit (the original passed lda.fit here).
lda.arrows(lda.fit.bonus, myscale = 2)
# FIX: the sentence below was bare prose in the original script and
# would not parse as R code — kept as a comment:
# Run the code below for the (scaled) train data that you used to fit
# the LDA. The code creates a matrix product, which is a projection of
# the data points.
model_predictors <- dplyr::select(train, -crime)
# Check the dimensions (rows x predictors, predictors x discriminants)
dim(model_predictors)
## [1] 404 13
dim(lda.fit$scaling)
## [1] 13 3
# Matrix multiplication: project the training predictors onto the LD axes
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
# 3D scatter of the projection, colored by the crime class in train
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type = 'scatter3d', mode = 'markers', color = train$crime)
# The same projection, colored by the k-means cluster assignments
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type = 'scatter3d', mode = 'markers', color = cluster_target)